#!/usr/bin/env perl

=head1 LICENSE

Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
Copyright [2016-2019] EMBL-European Bioinformatics Institute

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

=cut


=head1 CONTACT

 Please email comments or questions to the public Ensembl
 developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.

 Questions may also be sent to the Ensembl help desk at
 <http://www.ensembl.org/Help/Contact>.

=cut

## collect variant synonyms & load to db
##  - initially from PharmGKB database

use strict;
use warnings;
use HTTP::Tiny;
use Getopt::Long;
use DBI qw(:sql_types);
use Time::Piece;

use Bio::EnsEMBL::Registry;
use Bio::EnsEMBL::Variation::Source;

our $DEBUG = 0;

my ($data_file, $registry_file, $species, $source_name, $clean, $source_version, $source_url, $source_description, $host,
   $port, $user, $pass, $db_name);

GetOptions ("data_file=s"          => \$data_file,
            "species=s"            => \$species,
            "source_name=s"        => \$source_name,
            "source_url=s"         => \$source_url,
            "source_version=s"     => \$source_version,
            "source_description=s" => \$source_description,
            "registry=s"           => \$registry_file,            
            "host=s"               => \$host,
            "port=s"               => \$port,
            "user=s"               => \$user,
            "pass=s"               => \$pass,
            "db_name=s"            => \$db_name,
            "clean"                => \$clean 
            );

usage() unless defined $registry_file && defined $source_name; 

$species ||= 'homo_sapiens';


my $reg = 'Bio::EnsEMBL::Registry';
$reg->load_all($registry_file);
my $dba = $reg->get_DBAdaptor($species, 'variation'); 

## include failed variants to avoid missing any links
$dba->include_failed_variations(1);


## collect synonyms by source
if($source_name eq 'PharmGKB') { 
  my $source = get_source($species, $dba, $source_name, $source_version, $source_url, $source_description, 'germline');

  my $synonyms;

  ## collect data files if not already available
  $data_file = download_file('PharmGKB', 'rsid.zip') unless $data_file;

  ## extract synonyms from file
  $synonyms  = extract_PharmGKB($data_file);
  
  ## add synonyms to the database
  import_synonyms($synonyms, $source, $dba, $species); 
}

elsif($source_name eq 'dbSNP') {
  ### import synonyms from production database ### Access directly database because this database doesn't have adaptors 
  my $prod_dbc = Bio::EnsEMBL::DBSQL::DBConnection->new
    ( '-host'    => $host,
      '-port'    => $port, 
      '-user'    => $user,
      '-pass'    => $pass,
      '-dbname'  => $db_name, 
      '-driver'  =>  'mysql' 
    ); 

  my $sql_query = "SELECT rs_name,synonym_name,source_id FROM human_synonym"; 
  my $synonyms_output = query_synonyms_data($prod_dbc, $sql_query); 

  my %data; # synonym_name => {rs_name} & {source_id} 

  # get data from human_synonym table (in production db) 
  # some synonyms names are duplicated - from different sources 
  foreach my $result (@{$synonyms_output}){  
    $data{$result->[0].' '.$result->[1]}{rs_name} = $result->[0]; 
    $data{$result->[0].' '.$result->[1]}{source_id} = $result->[2]; 
  }

  # get all available sources from production database  
  my $sources = get_all_sources($dba, $prod_dbc);

  import_synonyms_all_sources($sources, $dba, $species, \%data);  
} 

elsif($source_name eq 'UniProt'){
  # download UniProt file if it's not provided  
  $data_file = download_file('UniProt', 'humsavar.txt') unless $data_file; 
 
  my $result = parse_uniprot($data_file); 

  # Source UniProt - variation_set table 
  my %source_uniprot = (
  "UniProt"     => {
    description => "Variants with annotations provided by UniProt",
    url         => "http://www.uniprot.org/",
    set         => "Uniprot variants",
    type        => "Variation",
    status      => 'mixed',
  }, 
  ); 

  # get source info 
  $source_description = $source_uniprot{$source_name}->{description}; 
  $source_url = $source_uniprot{$source_name}->{url}; 
  my $set = $source_uniprot{$source_name}->{set}; 
  my $object_type = $source_uniprot{$source_name}->{type}; 
  my $source_status = $source_uniprot{$source_name}->{status}; 

  # get source and returns source_id 
  my $source = get_source($species, $dba, $source_name, $source_version, $source_url, $source_description, $source_status); 
  my $source_id = $source->dbID();
  my $source_name = $source->name(); 

  my @ids; 
  my %synonym;
  
  if (exists($result->{'synonyms'})) {
    %synonym = %{$result->{'synonyms'}};
    # To get all the ids of the source (Uniprot)
    @ids = keys(%synonym);
  }

  # Add the synonyms
  my $variation_set_id = add_set_uniprot($dba,$set,$source_description);
  add_synonyms_uniprot(\%synonym, $source_name, $variation_set_id, $dba, $species); 
}

else{
    die "data source : $source_name not supported\n";
} 

=head2 download_file

collect current export from PharmGKB and UniProt site

=cut

sub download_file {
  my $source = shift; 
  my $data_file = shift;
  my $output_file; 

  my $http = HTTP::Tiny->new();
  my $url; 

  if($source eq "PharmGKB") {
    $url = "https://api.pharmgkb.org/v1/download/file/data/" . $data_file; 
  }
  elsif($source eq "UniProt") {
    $url = "https://www.uniprot.org/docs/" . $data_file; 
  }
  else {
    die "Can't download file for data source: $data_file\n"; 
  }
  
  my $response = $http->get($url); 
  die "Failed to pick up file: $data_file\n" unless $response->{success};

  open (my $out, ">" . $data_file) or die ("Failed to write data locally : $!\n");
  print $out  $response->{content};

  if($source eq "PharmGKB") {
    eval{
    `unzip rsid.zip`
    };
    die "Failed to unzip data :$@\n" unless $@ eq '';
    $output_file = "rsid.tsv"; 
  }
  else {
    $output_file = $data_file; 
  }

  return $output_file; 
} 

=head2 parse_uniprot

parse UniProt file 

=cut 

sub parse_uniprot {
  my $infile = shift;
  
  my %synonym;
    
  # Open the input file for reading
  if($infile =~ /gz$/) {
    open (IN, "zcat $infile |") or die ("Could not open $infile for reading");
  }
  else {
    open (IN,'<',$infile) or die ("Could not open $infile for reading");
  }
  
  # Read through the file and parse out the desired fields
  while (<IN>) {
    chomp;
    
    # A regexp to catch the meta information in the header. Just echo this to the stdout for logging purposes
    if ($_ =~ m/^\s*(Description|Name|Release)\:/) {
      print STDOUT $_ . "\n";
    }
    
    # Main regexp to extract relevant variation information
    if ($_ =~ m/^(\S+)\s+\S+\s+(VAR\_\d+)\s+\w\.\S+\s+(Disease|Polymorphism|Unclassified)\s+(\-|rs\d*)\s*(.*)$/) {
      
      # Get the data that was caught by the regexp
      my $gene = $1;
      my $uniprot_id = $2;
      my $rs_id = $4;
      my $phenotype = $5;
      
      # If no rsId was given, will attempt to get one by looking up the Uniprot id in the synonym table
      if ($rs_id ne '-') {
        push(@{$synonym{$rs_id}},$uniprot_id);
      }
    }
  }
  close(IN);
  
  my %result = ('synonyms' => \%synonym);
  return \%result;
}

=head2 add_set_uniprot

add UniProt variation set 

=cut

sub add_set_uniprot {  
  my $db_adaptor      = shift;
  my $set_name        = shift; 
  my $set_description = shift;

  my $attrib_name = 'ph_uniprot';

  # check if UniProt already exists 
  my $stmt = qq{
    SELECT
      variation_set_id
    FROM
      variation_set
    WHERE
      name = '$set_name'
    LIMIT 1
  };

  my $sth = $db_adaptor->dbc->prepare($stmt);
  $sth->execute();
  my $variation_set_id;
  $sth->bind_columns(\$variation_set_id);
  $sth->fetch();

  if (!defined($variation_set_id)) {  
    my $stmt_attrib = qq{
      SELECT
        attrib_id
      FROM
        attrib
      WHERE
        value = '$attrib_name'
      LIMIT 1
    };
    my $sth_attrib = $db_adaptor->dbc->prepare($stmt_attrib);
    $sth_attrib->execute();
    my $attrib_id;
    $sth_attrib->bind_columns(\$attrib_id);
    $sth_attrib->fetch();

    if(!defined($attrib_id)){
      die "Error: attrib '$attrib_name' is not defined in attrib table.\n";
    }

    my $ins_stmt = qq{
      INSERT IGNORE INTO
        variation_set (
        name,
        description,
        short_name_attrib_id
        )
      VALUES (
        '$set_name',
        '$set_description',
        '$attrib_id' 
      )
    };

    $db_adaptor->dbc->do($ins_stmt); 
    $variation_set_id = $db_adaptor->dbc->db_handle->{'mysql_insertid'};
 
  } 
  
  return $variation_set_id; 
}

=head2 add_synonyms_uniprot

import UniProt synonyms;

=cut

sub add_synonyms_uniprot {
  my $synonyms           = shift;
  my $source_name        = shift;
  my $variation_set_id   = shift; 
  my $db_adaptor         = shift;
  my $species            = shift; 

  # If we actually didn't get any synonyms, just return
  return if (!defined($synonyms) || !scalar(keys(%{$synonyms})));
 
  my $variation_adaptor = $db_adaptor->get_VariationAdaptor($species, 'variation', );

  my $stmt_check_var_set = qq{
    SELECT 
      variation_set_id
    FROM 
      variation_feature 
    WHERE 
      variation_id=?
  }; 
 
  my $stmt_update_after_check = qq{
    UPDATE
      variation_feature 
    SET 
      variation_set_id = '$variation_set_id'
    WHERE
      variation_id=?
  }; 

  my $stmt_update = qq{
    UPDATE
      variation_feature 
    SET 
      variation_set_id = CONCAT_WS(',',variation_set_id,'$variation_set_id')  
    WHERE
      variation_id=?
  }; 

  my $check_variation_set_sth = $db_adaptor->dbc->prepare($stmt_check_var_set); 
  my $update_after_check_sth = $db_adaptor->dbc->prepare($stmt_update_after_check);
  my $update_variation_set_sth = $db_adaptor->dbc->prepare($stmt_update); 
  
  foreach my $rs_id (keys %{$synonyms}) {
    my $variation_id = $synonyms->{$rs_id}[0];    

    my $variation = $variation_adaptor->fetch_by_name($rs_id);
    unless($variation){
      warn "Variant $rs_id not found\n"; 
      next; 
    }

    $variation->add_synonym($source_name, @{$synonyms->{$rs_id}}); 
    $variation_adaptor->store_synonyms($variation); 
    
    $check_variation_set_sth->bind_param(1, $variation_id, SQL_VARCHAR);
    $check_variation_set_sth->execute();
    my $var_set_id;
    $check_variation_set_sth->bind_columns(\$var_set_id);
    $check_variation_set_sth->fetch();

    if(!defined($var_set_id)){
      $update_after_check_sth->execute($variation_id);
    }
    else{
      $update_variation_set_sth->execute($variation_id); 
    }
  }

}

=head2 extract_PharmGKB

extract data from PharmGKB file

=cut

sub extract_PharmGKB {

  my $data_file = shift;

  my %synonyms;

  open my $rslist, $data_file ||die "Failed to open synonym list to load: $!\n";
  while(<$rslist>) {
    next if/RSID/;

    my @a = split/\t/;
    $synonyms{$a[0]} = $a[3];
  }
  return \%synonyms;
}

=head2 import_synonyms

import synonyms from refhash and source object;

=cut

sub import_synonyms {

  my $synonyms = shift;
  my $source   = shift;
  my $dba      = shift;
  my $species  = shift;

  my $variation_adaptor = $dba->get_VariationAdaptor($species, 'variation', );

  foreach my $var_name (keys %{$synonyms}) {

    my $var = $variation_adaptor->fetch_by_name($var_name);
    unless($var){
      warn "variant $var_name not found\n";
      next;
    }
    $var->add_synonym($source->name(), $synonyms->{$var_name});
    $variation_adaptor->store_synonyms($var);
  }
}

=head2 import_synonyms_all_sources

import all synonyms from production database  

=cut

sub import_synonyms_all_sources {

  my $sources           = shift;  
  my $dest_db           = shift; 
  my $species           = shift; 
  my $data              = shift;

  my $variation_adaptor = $dest_db->get_VariationAdaptor($species, 'variation');  

  foreach my $synonym_name (keys %{$data}) { 
    my $var_name = $data->{$synonym_name}->{rs_name}; 
    my $var = $variation_adaptor->fetch_by_name($var_name); 

    unless($var) {
      warn "variant $var_name not found\n";
      next; 
    }
  
    my $source_id = $data->{$synonym_name}->{source_id};
    my $source = $sources->{$source_id}; 
    # Clean synonym name before insert in db - remove rsid from name  
    $synonym_name =~ s/.*\s//g; 
    $var->add_synonym($source->name(), $synonym_name);
    $variation_adaptor->store_synonyms($var); 

  } 
} 

=head2 get_source

get or add source object

=cut

sub get_source {

  my $species       = shift;
  my $dba           = shift;
  my $source_name   = shift;
  my $version       = shift;
  my $url           = shift;
  my $description   = shift;
  my $source_status = shift;

  my $source_adaptor = $dba->get_SourceAdaptor('human', 'variation', );
  my $source = $source_adaptor->fetch_by_name($source_name);

  if (defined $source) {
    ## do we need to update the version of an existing source?
    if(defined $version) { 
      $source->version($version);
      $source_adaptor->update_version($source);   
    }
    # Update version with current date 
    else{ 
      $version = localtime->strftime('%Y%m%d');
      $source->version($version);
      $source_adaptor->update_version($source);
    }
  }
  else{
    ## update enter new source
    print "Source information not held for $source_name - adding supplied info\n" unless defined $source ;
    $source = Bio::EnsEMBL::Variation::Source->new
       (-name           => $source_name,
        -url            => $url           || undef,
        -version        => $version       || undef,
        -description    => $description   || undef,
        -somatic_status => $source_status || undef,
        -data_types     => ['variation_synonym']
      );
    eval{
      $source_adaptor->store($source);
    };
    die "ERROR storing source: $@\n" unless $@ eq ''; 
  }
  return $source;
}

=head2 get_all_sources

get sources object 

=cut

sub get_all_sources {

  my $dest_db  = shift;
  my $prod_dbc = shift; 

  # my %source_names; # hash with source id and corresponding source name 
  my %source_list; 

  my $source_adaptor = $dest_db->get_SourceAdaptor('human', 'variation'); 

  # Get sources from production that are linked to human synonym data 
  my $sql_query = "SELECT source_id,name,version,description,url FROM source WHERE source_id IN
                   (SELECT DISTINCT(source_id) FROM human_synonym)";
  my $source_output = query_synonyms_data($prod_dbc, $sql_query);  

  foreach my $result_from_source (@{$source_output}) {  
    # my $source_id = $result_from_source->[0]; 
    my $source_name = $result_from_source->[1]; 
    my $source = $source_adaptor->fetch_by_name($source_name);

    if(!defined($source)) { 
      # get info for source  
      $source = Bio::EnsEMBL::Variation::Source->new
         (-name        => $source_name, 
          -url         => $result_from_source->[4],
          -version     => $result_from_source->[2],
          -description => $result_from_source->[3], 
          -data_types  => ['variation_synonym']
        );

      eval{
        $source_adaptor->store($source);
      };
      die "ERROR storing source: $@\n" unless $@ eq ''; 
    } 
    $source_list{$result_from_source->[0]} = $source; 
  } 

  return \%source_list; 
} 

=head2 query_synonyms_data 

fetch data from production database   

=cut

sub query_synonyms_data {

  my $prod_dbc   = shift;
  my $sql_query  = shift; 

  my $statement = $prod_dbc->prepare($sql_query);  
  $statement->execute(); 
  my $source_output = $statement->fetchall_arrayref();  
  $statement->finish(); 

  return $source_output; 
} 

sub usage{

  die "\nUsage : import_variant_synonyms -registry [registry file] -source_name [name]
\n\tOptions:
\t         -data_file          [name of file to load]
\t         -source_version     [version]
\t         -source_url         [url]
\t         -source_description [longer description]
\t         -species            [species]               - defaults to human
\t         -registry           [registry file]
\t         -db_name            [production db name]
\t         -host               [production db host]
\t         -port               [production db port]
\t         -user               [user]
\t         -pass               [pass]
\t         -clean                                      - remove old data\n\n";
}
